# Remove every object (hidden, dot-prefixed ones included: `all` partially
# matches ls()'s `all.names` argument) from the environment the script runs in.
# NOTE(review): this does not detach packages or reset options, so it is not
# equivalent to a fresh R session, and it deletes anything the user had loaded
# beforehand (e.g. the "dfm.RData" workspace mentioned below).
rm(list=ls(all=TRUE))
# setwd("....") set your working directory where the legislative speech texts are stored

# to fully replicate our results you need to have installed the following two packages:
# "quanteda.seededlda_0.2" and "quanteda_1.5.1"

# to install "quanteda.seededlda_0.2":
# devtools::install_github("koheiw/seededlda", ref = "ad0c345a6f05d3cfefd15f3f108a03d7653791b2")

# to install "quanteda_1.5.1":
# devtools::install_github("quanteda/quanteda", ref = "v1.5.1") 

# If you are using a quanteda version > 2.0 and you want to avoid installing quanteda_1.5.1 (highly recommended),
# you can also replicate our results by simply loading the "dfm.RData" workspace.
# In that case, after loading the workspace, you can skip Steps 1 and 2 and start the analysis directly from Step 3 below.

# Attach the packages the script depends on. library() stops immediately with
# a clear error when a package is missing, whereas require() only warns and
# returns FALSE, letting the script fail later with an unrelated message.
library(quanteda)
library(readtext)
library(quanteda.seededlda)

####################################################
#### Step 1: open the texts
# IMPORTANT: the "zip_texts" archive file in Dataverse is a .rar file. Before running this script, please convert it from a .rar file
# to a .zip file, for example here: https://www.convertfiles.com/convert/archive/RAR-to-ZIP.html
####################################################

# Read every speech in the archive. The file names are split on "_" and the
# resulting pieces are stored as the document variables Party and Mission.
myText <- readtext(
  "zip_texts.zip",
  docvarsfrom = "filenames",
  dvsep = "_",
  docvarnames = c("Party", "Mission"),
  encoding = "UTF-8"
)

# The first column holds the document identifier; call it "Name" so it can be
# used as the merge key with the metadata table later on.
names(myText)[1] <- "Name"

# Strip the file extension. fixed = TRUE matches the literal ".txt"; as a
# regular expression the "." would match any character, so a name containing
# e.g. "atxt" would have been mangled.
myText$Name <- gsub(".txt", "", myText$Name, fixed = TRUE)

####################################################
#### Step 2: clean texts, creating corpus and then DfM
####################################################

# Characters that the texts use where an apostrophe is meant. Each one is first
# mapped to a plain "'" (same order as the original one-call-per-character
# code); the list is kept exactly as in the original script.
apostrophe_like <- c(
  "\u0092", "\u2019", "\u00b4", "\u00d5", "\u017e",
  "\u017d", "\u02c6", "\u00ca", "\u02dc", "\u00c7"
)
for (ch in apostrophe_like) {
  myText$text <- gsub(paste0("[", ch, "]"), "'", myText$text)
}

# Finally turn every apostrophe into a space, so that elided forms are split
# into separate tokens.
myText$text <- gsub("'", " ", myText$text)

# Replace quoted occurrences of "no"/"No" with a bare, lower-case "no".
# NOTE(review): in this copy of the script the characters surrounding no/No
# are the Unicode replacement character (U+FFFD), presumably quotation marks
# lost to a wrong file encoding, which also makes the first two lines
# identical -- confirm the intended characters against the original script.
myText$text <- gsub("�no�","no",myText$text)
myText$text <-  gsub("�no�","no",myText$text)
myText$text <-  gsub("�No�","no",myText$text)

# Load the document-level metadata; the first row of the file is the header.
meta.pr <- read.csv(file = "meta_table.csv", header = TRUE)

# Name is the merge key with the texts, so make sure it is plain character.
meta.pr$Name <- as.character(meta.pr$Name)

# Drop the second column, then rename what is now the fourth column.
meta.pr <- meta.pr[-2]
colnames(meta.pr)[4] <- "Mission_name"

# Attach the metadata to the texts. merge() keeps only the names present in
# both tables and, by default, sorts the result by the key, so the row order of
# `fit` need not match the row order of `meta.pr`.
fit <- merge(myText, meta.pr, by = "Name")
names(fit)[5] <- "LR"

# Remember the document names in the order of the merged rows before the key
# column is dropped. The original took them from meta.pr$Name, which can be in
# a different order (or of a different length) than `fit`.
doc_names <- fit$Name

# Drop the key column (merge() puts it first) and store the filename-derived
# variables as factors.
fit <- fit[, -1]
fit$Party <- as.factor(fit$Party)
fit$Mission <- as.factor(fit$Mission)

# Build the corpus and label each document with its own name.
corpus <- corpus(fit)
docnames(corpus) <- doc_names

# Document-feature matrix: lower-cased, stemmed, without punctuation and
# numbers. Besides the Italian stopwords, the elided articles/prepositions and
# the country names listed below are removed (strings kept exactly as in the
# original script).
myDfm <- dfm(
  corpus,
  tolower = TRUE,
  stem = TRUE,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove = c(
    stopwords("italian"),
    "l", "d", "dell", "dall",
    "afganistan", "libano", "kosovo", "iraq", "libia", "albania"
  )
)

########################### 
### Step 3: define the word-seeds
########################### 

# Seed words for the three topics of interest. Entries ending in "*" are
# presumably wildcard patterns under quanteda's default matching -- confirm.
# "democrazia" is listed twice under humanitarian_dimension; it is kept as in
# the original so that the dictionary is unchanged.
myDict <- dictionary(list(
  multilateralism = c(
    "multilateralism", "comunit", "responsabilit", "alleanza",
    "alleati", "impegno", "sicurezza", "coalizione"
  ),
  humanitarian_dimension = c(
    "democrazia", "umani", "democrazia", "democratica", "diritto", "pace",
    "solidariet", "libert", "pacific*", "umanitaria", "umanitari", "solidal*"
  ),
  war = c(
    "guerra", "militare", "bombardamenti", "militari", "costituzione",
    "disarmo", "chiarezza", "violenza", "bombe", "rischi", "vittime"
  )
))

##########################
### Step 4: fitting the seeded LDA 
########################## 

# Fix the random seed before fitting so that the run can be repeated, then fit
# the seeded LDA with an additional residual topic.
set.seed(123)
slda <- textmodel_seededlda(myDfm, myDict, residual = TRUE)

# Matrix of per-document topic coefficients: rows are documents, columns are
# topics. As in the original code, columns 1-3 are taken to be the three
# seeded topics in the order they appear in myDict.
doc_topic <- slda$lda@gamma

# The original looped over a hard-coded 1:104 and sized its vectors with
# nrow(myText). Check the dimensions explicitly instead, so a different number
# of documents fails loudly rather than leaving NAs or indexing out of bounds.
if (nrow(doc_topic) != nrow(fit)) {
  stop(
    "Seeded LDA returned ", nrow(doc_topic),
    " document rows but 'fit' has ", nrow(fit), " rows.",
    call. = FALSE
  )
}
if (ncol(doc_topic) < 3) {
  stop("Expected at least 3 topic columns in the fitted model.", call. = FALSE)
}

# Whole-column extraction replaces the element-by-element loops.
multilateralism <- unname(doc_topic[, 1])
humanitarian_dimension <- unname(doc_topic[, 2])
war <- unname(doc_topic[, 3])

# Keep everything except the first column of `fit` and append the three
# topic scores.
fit_slda <- fit[-1]
fit_slda$multilateralism <- multilateralism
fit_slda$humanitarian_dimension <- humanitarian_dimension
fit_slda$war <- war

##########################
### Step 5: saving the results for the Stata analysis
########################## 

# Export the scored documents; row names are written as the first column
# (write.csv default).
write.csv(x = fit_slda, file = "scores_slda.csv")
